# Calculate/load Differential Expression metrics for all genes, load SFARI dataset:

# Gene expression data
load('./../working_data/RNAseq_ASD_4region_normalized.Rdata')

# SFARI genes
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
SFARI_genes = SFARI_genes %>% inner_join(datProbes, by=c('gene-symbol'='external_gene_id')) %>%
              mutate('ID' = ensembl_gene_id) %>%
              dplyr::select(ID, `gene-score`, syndromic)

# Balance Groups by covariates, remove singular batches (none)
to_keep = (datMeta$Subject_ID != 'AN03345') & !is.na(datMeta$Dx)
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
  
if(!file.exists('./../working_data/genes_ASD_DE_info.csv')) {
  
  # Calculate differential expression for ASD
  mod = model.matrix(~ Dx, data=datMeta)
  corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
  lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
  
  fit = eBayes(lmfit, trend=T, robust=T)
  top_genes = topTable(fit, coef=2, number=nrow(datExpr))
  genes_DE_info = top_genes[match(rownames(datExpr), rownames(top_genes)),] %>%
                  mutate('ID'=rownames(datExpr)) %>% left_join(SFARI_genes, by='ID')
  
  write_csv(genes_DE_info, path='./../working_data/genes_ASD_DE_info.csv')
  
  rm(mod, corfit, lmfit, fit, top_genes)
} else {
genes_DE_info = read_csv('./../working_data/genes_ASD_DE_info.csv')
}

rm(to_keep, datSeq, datProbes)

Scatter plot of genes coloured by score

Genes with an autism score don’t seem to have better behaviour than the rest of the genes:

genes_DE_info = genes_DE_info %>% mutate('gene-score'=ifelse(is.na(`gene-score`),'None',`gene-score`))

# Create subsample of genes without score so plot is not that heavy
genes_DE_info_score = genes_DE_info %>% filter(`gene-score`!='None')
genes_DE_info_sample = genes_DE_info %>% filter(`gene-score`=='None') %>% 
  sample_frac(0.25) %>% bind_rows(genes_DE_info_score)

ggp = ggplotly(genes_DE_info_sample %>% ggplot(aes(adj.P.Val, logFC, color=`gene-score`)) + 
               geom_point(aes(ids=ID, alpha=`gene-score`)) + scale_alpha_discrete(range=c(1, 0.2)) + 
               geom_vline(aes(xintercept=0.03), color='#666666') + scale_color_manual(values=gg_colour_hue(7)) + 
               theme_minimal())
rangeslider(ggp, start=0, end=1.05)
rm(genes_DE_info_score, genes_DE_info_sample, ggp)

Boxplots of the log2 fold change by score

ggplotly(genes_DE_info %>% ggplot(aes(`gene-score`, abs(logFC), fill=`gene-score`)) + geom_boxplot() + 
                     scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())

Perhaps there was an error in the calculation of the differential expression?

Comparing expression density between top and least DE genes

Looks OK…

top_gene_id = genes_DE_info$ID[which.max(abs(genes_DE_info$logFC))]
bottom_gene_id = genes_DE_info$ID[which.min(abs(genes_DE_info$logFC))]
top_bottom_gene_expr = data.frame('Expr_top'=datExpr[rownames(datExpr)==top_gene_id,], 
                                  'Expr_bottom'=datExpr[rownames(datExpr)==bottom_gene_id,],
                                  'Diagnosis'=datMeta$Diagnosis)
p1 = top_bottom_gene_expr %>% ggplot(aes(Expr_bottom, color=Diagnosis, fill=Diagnosis)) + 
                                     geom_density(alpha=0.4) + theme_minimal()
p2 = top_bottom_gene_expr %>% ggplot(aes(Expr_top, color=Diagnosis, fill=Diagnosis)) + 
                                     geom_density(alpha=0.4) + theme_minimal()

grid.arrange(p1, p2, ncol=2)

rm(top_gene_id, bottom_gene_id, top_bottom_gene_expr, p1, p2)

Comparing expression levels between different scores

Boxplots of the average difference in mean between diagnosis by score

Same as with the boxplots for log fold change

make_ASD_vs_CTL_df = function(datExpr){
  datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='ASD'))
  datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_!='ASD'))
  
  ASD_vs_CTL = data.frame('ID'=rownames(datExpr),
                          'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
                          'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
               mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
               left_join(SFARI_genes, by='ID') %>%
               dplyr::select(ID, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
               mutate('gene-score'=ifelse(is.na(`gene-score`),'None',`gene-score`))
  
  return(ASD_vs_CTL)
}

ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)

ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + geom_boxplot() +
               scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal()

# ASD_vs_CTL %>% ggplot(aes(`gene-score`, sd_diff, fill=`gene-score`)) + geom_boxplot() +
#               scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal()

Density plot of the average difference in mean between diagnosis by score

Score 1 outlier has ID = ENSG00000136535 (TBR1), it’s the same one that almost made the cut in the scatter plot

ggplotly(ASD_vs_CTL %>% ggplot(aes(abs(mean_diff), color=`gene-score`, fill=`gene-score`)) + 
                        geom_density(alpha=0.3) + theme_minimal() + 
                        scale_fill_manual(values=gg_colour_hue(7)) + 
                        scale_color_manual(values=gg_colour_hue(7)))

Density plot of the average difference in standard deviation between diagnosis by score

Score 1 outlier is the same one as in the previous plot

ggplotly(ASD_vs_CTL %>% ggplot(aes(abs(sd_diff), color=`gene-score`, fill=`gene-score`)) +
                        geom_density(alpha=0.3) + theme_minimal() +
                        scale_fill_manual(values=gg_colour_hue(7)) +
                        scale_color_manual(values=gg_colour_hue(7)))

Perhaps the error was when performing normalisation?

Comparing expression levels between different scores using raw data and matching IDs to SFARI’s using biomaRt

Loading raw data, modifying metadata and SFARI genes manually and filtering genes

# Load csvs
datExpr = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../raw_data/RNAseq_ASD_datMeta.csv')
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')

# Make sure datExpr and datMeta columns/rows match
rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
if(!all(colnames(datExpr) == rownames(datMeta))){
  print('Columnd in datExpr don\'t match the rowd in datMeta!')
}

# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
            'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
datProbes$length = datProbes$end_position-datProbes$start_position

#################################################################################
# FILTERS:

# 1 Filter probes with start or end position missing (filter 5)
# These can be filtered without probe info, they have weird IDS that don't start with ENS
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id

# 3. Filter samples from ID AN03345 (filter 2)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

#################################################################################
# Annotate SFARI genes

# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19

gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'), 
                   values=SFARI_genes$`gene-symbol`, mart=mart) %>% 
                   mutate('gene-symbol'=hgnc_symbol, 'ID'=ensembl_gene_id) %>% 
                   dplyr::select('ID', 'gene-symbol')

SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')

datExpr_backup = datExpr

rm(getinfo, to_keep, gene_names, mart)

Boxplot of difference in mean between diagnosis by score

  • Score 1 has the highest median, all scores have higher median than the rest of the genes
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)

ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + 
         geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())

Applying log2 transformation to the unnormalised data

Genes labeled with Score 1 and 2 are the most affected, now their medians are just above the median of the genes without a score, as well as their 3rd quantile. (Does this make sense? Log2 is a monotonic transformation, why would it affect different distributions in a non monotonic way?)

ASD_vs_CTL = make_ASD_vs_CTL_df(log2(datExpr_backup+1))

ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + 
         geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())

Performing Quantile Normalisation

Now both the median and the 3rd quantile of the genes with score 1 and 2 are below the general median of the data without score

datExpr = datExpr_backup # Should be the same, but in case code is ran in different order...

# Check datProbes and datExpr have same rows
if(!all(rownames(datExpr) == rownames(datProbes))){
  print('Rows in datExpr don\'t match the ones in datProbes!')
}

# Conditional Quantile Normalisation (CQN)
cqn.dat = cqn(datExpr, lengths = datProbes$length,
              x = datProbes$percentage_gc_content,
              lengthMethod = 'smooth',
              sqn = FALSE)  # Run cqn with specified depths and with no quantile normalisation
datExpr = cqn.dat$y + cqn.dat$offset  # Get the log2(Normalised RPKM) values

Boxplot of difference in mean between diagnosis by score

Scores 1 and 2 now have the same median as the genes without score and a lower 3rd quantile

ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)

ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + 
         geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())

Filtering genes with low counts

# Filter out genes with low counts (filter 43406)
pres = apply(datExpr>1, 1, sum) 
to_keep = pres > 0.5 * ncol(datExpr)
datExpr = datExpr[to_keep,]

datExpr_post_Norm = datExpr

Boxplot of difference in mean between diagnosis by score

Genes with score 1 and 2 now have lower medians and 3rd quantiles than the genes withouth a score

ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)

ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + 
         geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())


Other Normalisation techniques?

Using DESeq2, with the normalized option in counts and applying log2 transformation manually

datExpr = datExpr_backup

counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
                    IRanges(datProbes$start_position, width=datProbes$length),
                    strand=datProbes$strand,
                    feature_id=datProbes$ensembl_gene_id)

se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis_+Region)

#Estimate size factors
dds = estimateSizeFactors(dds)

# Plot column sums according to size factor
# plot(sizeFactors(dds), colSums(counts(dds)))
# abline(lm(colSums(counts(dds)) ~ sizeFactors(dds) + 0))

datExpr = log2(counts(dds, normalized=TRUE) + 1)

# Filter out genes with low counts (filter 35303)
pres = apply(datExpr>1, 1, sum) 
to_keep = pres > 0.5 * ncol(datExpr)
datExpr = datExpr[to_keep,]

Boxplot of difference in mean between diagnosis by score

Same result as before …

ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)

ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + 
         geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())

Using vst (variance stabilisation normalisation) from DESeq2

vst_output = vst(dds)
datExpr = assay(vst_output)

# Filter out genes with low counts (filter 0)
pres = apply(datExpr>1, 1, sum)
to_keep = pres > 0.5 * ncol(datExpr)
datExpr = datExpr[to_keep,]

Boxplot of difference in mean between diagnosis by score

A bit better, at least the median and 3rd quantiles of genes with score are higher than genes without a score, but the distinction was better before performing the log2 transformation

ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)

ggplotly(ASD_vs_CTL %>% ggplot(aes(`gene-score`, abs(mean_diff), fill=`gene-score`)) + 
         geom_boxplot() + scale_fill_manual(values=gg_colour_hue(7)) + theme_minimal())

Compare expression densities for Score 1 genes before and after normalisation (cqn normalisation)

Before:

ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr_backup)
## Warning: Column `ID` joining factor and character vector, coercing into
## character vector
ggp = ggplotly(ASD_vs_CTL %>% filter(`gene-score`!='None') %>% 
               ggplot(aes(abs(mean_diff), color=`gene-score`, fill=`gene-score`)) + 
                      geom_density(alpha=0.3) + theme_minimal() + 
                      scale_fill_manual(values=gg_colour_hue(7)) + 
                      scale_color_manual(values=gg_colour_hue(7)))
rangeslider(ggp, start=0, end=6000)

After:

ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr_post_Norm)

ggp = ggplotly(ASD_vs_CTL %>% ggplot(aes(abs(mean_diff), color=`gene-score`, fill=`gene-score`)) + 
               geom_density(alpha=0.3) + theme_minimal() + 
               scale_fill_manual(values=gg_colour_hue(7)) + 
               scale_color_manual(values=gg_colour_hue(7)))
rangeslider(ggp, start=0, end=3)